- /* -*- Mode: C -*-
- * weight.c --
- * ITIID : $ITI$ $Header $__Header$
- * Author : Huynh Quoc T. Tung
- * Created On : Tue May 17 11:45:07 1994
- * Last Modified By: Ulrich Pfeifer
- * Last Modified On: Mon May 30 17:13:10 1994
- * Update Count : 35
- * Status : Unknown, Use with caution!
- */
-
- /********************* INTRODUCTION ************************
- * Documents are represented by term vectors of the form
- * D = (t_0,w_d0; t_1,w_d1; ...; t_t,w_dt)
- * where each t_k identifies a content term assigned to some sample
- * document and w_dk represents the weight of term t_k in document D
- * (or query Q). Thus, a typical query Q might be formulated as
- * Q = (q_0,w_q0; q_1,w_q1; ...; q_t,w_qt)
- * where q_k once again represents a term assigned to query Q.
- * The weights are allowed to vary continuously between 0 and 1; higher
- * weight assignments near 1 are used for the most important terms,
- * whereas lower weights near 0 characterize the less important terms.
- * Given this vector representation, a query-document similarity value may
- * be obtained by comparing the corresponding vectors, using for example
- * the conventional vector product formula
- * similarity(Q,D) = sum(w_qk * w_dk), k = 0 to t.
- *
- * Three factors are important for term weighting:
- * 1) term frequency in the individual document (recall)
- * 2) inverse document frequency (precision)
- * 3) document length (vector length)
- *
- * Term frequency component used here: new_wgt = 0.5 + 0.5 * tf / max_tf
- * i.e. the augmented normalized term frequency (the tf factor is
- * normalized by the maximum tf in the vector, and further normalized to
- * lie between 0.5 and 1.0).
- *
- * Collection frequency component used here: 1.0
- * (no change in weight; the original term frequency component is used).
- *
- * Normalization component used here: sqrt(sum(new_wgt^2)) = vector length.
- *
- * Thus, the document term weight is: w_dk = new_wgt / vector length
- *
- * For query term weighting it is assumed that tf equals 1, so that
- * w_qk = 1.
- *
- ****************************************************************************/
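-
- /* The block below is only an illustration of the weighting scheme described
- * above; it is NOT part of the indexer and is compiled only when
- * WEIGHT_EXAMPLE is defined (the guard name and all numbers are made up for
- * the example). It computes w_dk for one toy document exactly as outlined:
- * augmented normalized tf, vector length, the final normalized weights, and
- * the similarity against a query whose weights w_qk are all 1. */
- #ifdef WEIGHT_EXAMPLE
- #include <stdio.h>
- #include <math.h>
-
- int main()
- {
-   long tf[3] = {3, 1, 2};   /* raw term frequencies of one toy document */
-   long max_tf = 3;          /* maximum tf in this document */
-   double new_wgt[3], doc_len = 0.0, sim = 0.0;
-   int k;
-
-   for (k = 0; k < 3; k++) {
-     new_wgt[k] = 0.5 + 0.5 * tf[k] / (double) max_tf; /* augmented normalized tf */
-     doc_len += new_wgt[k] * new_wgt[k];
-   }
-   doc_len = sqrt(doc_len);  /* vector length */
-
-   for (k = 0; k < 3; k++) {
-     printf("w_d%d = %f\n", k, new_wgt[k] / doc_len);
-     sim += 1.0 * (new_wgt[k] / doc_len);  /* w_qk = 1 for every query term */
-   }
-   printf("similarity(Q,D) = %f\n", sim);
-   return 0;
- }
- #endif /* WEIGHT_EXAMPLE */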
-
- /********************* PROCEDURE DESCRIPTION ************************
- * assign_term_weight_for_doc(number_of_elements, db)
- * long *number_of_elements;
- * database *db;
- *
- * Computes the weight of every term of the current document (using the
- * globals max_term_frequency and bucket_ids_doc_array) and writes it
- * into the memory buffer.
- *
- * write_weight_in_ptr(weight, ptr)
- * float weight;
- * unsigned char *ptr;
- *
- * Writes the weight into the buffer at ptr.
- *
- * float read_weight_from_stream(new_weight_size, stream)
- * long new_weight_size;
- * FILE* stream;
- *
- * Reads a weight from the stream and returns it.
- *
- * void save_terms_for_doc(number_of_terms, db)
- * long *number_of_terms;
- * database *db;
- *
- * Saves all terms of the current document before the hashtable is
- * flushed to disk.
- *
- * void add_terms_saved(is_field, number_of_terms, doc_id, db)
- * boolean is_field;
- * long *number_of_terms;
- * long doc_id;
- * database *db;
- *
- * Adds the saved terms back into the hashtable.
- *
- ****************************************************************************/
-
- #include "irfiles.h"
- #include "cutil.h"
- #ifdef NEW_WEIGHT
- #include "futil.h"
- #include "hash.h"
- #include "irhash.h"
- #include "weight.h"
- #include <math.h>
-
- long max_term_frequency = 0; /* used in irhash.c */
- long bucket_ids_doc_array[DEFAULT_NUMBER_OF_BUCKETS + 1]; /* used in hash.c */
-
- void write_weight_in_ptr(weight, ptr)
- float weight;
- unsigned char *ptr;
- {
- float tmp_weight[1];
-
- tmp_weight[0] = weight;
- memcpy((unsigned char *)ptr, (unsigned char *)tmp_weight,NEW_WEIGHT_SIZE);
- }
-
- float read_weight_from_stream(new_weight_size, stream)
- long new_weight_size;
- FILE *stream;
- {
- float tmp;
- unsigned char *inc;
- int i;
-
- inc = (unsigned char*) &tmp;
- /* read the raw float bytes back in the order they were written */
- for (i=0; i<new_weight_size; i++) {
- *inc = fgetc(stream);
- inc++;
- }
- return(tmp);
- }
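-
- /* A minimal round-trip sketch, for illustration only: the weight is stored
- * as the raw bytes of a native float, so a value written with
- * write_weight_in_ptr() and dumped to a file can be recovered with
- * read_weight_from_stream(). It assumes NEW_WEIGHT_SIZE == sizeof(float)
- * and is compiled only when WEIGHT_SELFTEST (a made-up guard) is defined. */
- #ifdef WEIGHT_SELFTEST
- static void weight_round_trip_example()
- {
-   unsigned char buf[NEW_WEIGHT_SIZE];
-   FILE *fp = tmpfile();
-   float back;
-
-   if (fp == NULL)
-     return;
-   write_weight_in_ptr(0.125, buf);      /* serialize the weight into buf */
-   fwrite(buf, 1, NEW_WEIGHT_SIZE, fp);  /* as if it had been flushed to disk */
-   rewind(fp);
-   back = read_weight_from_stream((long) NEW_WEIGHT_SIZE, fp);
-   fprintf(stderr, "round trip: wrote 0.125, read back %f\n", back);
-   fclose(fp);
- }
- #endif /* WEIGHT_SELFTEST */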
-
- void assign_term_weight_for_doc(number_of_elements, db)
- long *number_of_elements;
- database *db;
- {
- long i;
- long id;
- long tf;
- float new_wgt;
- float doc_len = 0.0;
- hashtable *htable = db->the_word_memory_hashtable;
-
- /* compute document length */
- for(i=0; i< *number_of_elements; i++) {
- id = bucket_ids_doc_array[i];
- tf = (htable->contents)[id].occurances_in_doc;
- doc_len += (0.5 + (0.5 * tf / max_term_frequency)) * (0.5 + (0.5 * tf / max_term_frequency));
- }
- doc_len = sqrt(doc_len);
- /* assign the normalized weight to every term and reset its tf counter */
- for(i=0; i< *number_of_elements; i++) {
- id = bucket_ids_doc_array[i];
- tf = (htable->contents)[id].occurances_in_doc;
- (htable->contents)[id].occurances_in_doc = 0;
- new_wgt = (0.5 + (0.5 * tf / max_term_frequency)) / doc_len;
- if ((htable->contents)[id].current_memory_ptr == NULL) {
- fprintf(stderr, "panic: assign_term_weight_for_doc current_memory_ptr == NULL\n");
- } else {
- write_weight_in_ptr(new_wgt,
- (htable->contents)[id].current_memory_ptr - NEW_WEIGHT_SIZE);
- }
- }
- /* reset the per-document state for the next document */
- max_term_frequency = 0;
- *number_of_elements = 0;
- memset(bucket_ids_doc_array, 0, DEFAULT_NUMBER_OF_BUCKETS * sizeof(long));
- }
-
- /* terms of the document currently being indexed, saved across a hashtable flush */
- term_infotable *termtable = NULL;
-
- void save_terms_for_doc(number_of_terms, db)
- long *number_of_terms;
- database *db;
- {
- int info_size = CHARACTER_POSITION_SIZE;
- int cn_size = CHARACTER_POSITION_SIZE + NEW_WEIGHT_SIZE;
- long i, id, char_pos;
- hashtable *htable = db->the_word_memory_hashtable;
-
- if(*number_of_terms != 0) {
- if(termtable == NULL)
- termtable = (term_infotable *)s_malloc(sizeof(term_infotable) * *number_of_terms);
- if(termtable == NULL)
- panic("Out of memory");
- }
-
- for(i=0; i < *number_of_terms; i++) {
- id = bucket_ids_doc_array[i];
- if((termtable[i].term = (char*)s_malloc(sizeof(char) * (MAX_KEY_SIZE+1))) == NULL)
- panic("Out of memory");
- strncpy(termtable[i].term, (htable->contents)[id].key, MAX_KEY_SIZE);
- termtable[i].term[MAX_KEY_SIZE] = '\0'; /* strncpy need not terminate the copy */
- termtable[i].char_pos =
- read_bytes_from_memory(CHARACTER_POSITION_SIZE,
- (htable->contents)[id].current_memory_ptr - cn_size);
- termtable[i].tf = (htable->contents)[id].occurances_in_doc;
- if((htable->contents)[id].memory_size - WORD_MEMORY_INIT_BLOCK_SIZE == 0)
- (htable->contents)[id].number_of_occurances = STOP_WORD_FLAG;
- else (htable->contents)[id].current_memory_ptr -= WORD_MEMORY_INIT_BLOCK_SIZE;
- }
- *number_of_terms = 0;
- }
-
- void add_terms_saved(is_field, number_of_terms, doc_id, db)
- boolean is_field;
- long *number_of_terms;
- long doc_id;
- database *db;
- {
- long i, id;
- long number_of_elements = *number_of_terms;
- *number_of_terms = 0;
- for(i=0; i < number_of_elements; i++) {
- if(is_field)
- field_add_word(termtable[i].term, termtable[i].char_pos,0,1,doc_id,0,0,db,false);
- else
- add_word(termtable[i].term, termtable[i].char_pos,0,1,doc_id,0,0,db,false);
- id = bucket_ids_doc_array[i];
- (db->the_word_memory_hashtable->contents)[id].occurances_in_doc = termtable[i].tf;
- if(termtable[i].term != NULL)
- s_free(termtable[i].term);
- }
- if(termtable != NULL)
- s_free(termtable);
- }
-
-
- #endif /* NEW_WEIGHT */
-